Import des librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread

%matplotlib inline
pd.set_option('display.max_columns', 100)

import missingno as msno
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly

import cv2
import os
from os import path

from PIL import Image

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics 

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers

from keras.applications.vgg16 import VGG16, decode_predictions
from keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from keras.applications.vgg16 import preprocess_input

import time
import shutil 

import warnings
warnings.filterwarnings('ignore')

Import dataset

In [2]:
data = pd.read_csv("Flipkart/data_cleaned.csv")

Description jeu de données

In [3]:
print("Le jeu de données flipkart_com-ecommerce_sample contient %d lignes et %d colonnes." % (data.shape[0], data.shape[1]))
Le jeu de données flipkart_com-ecommerce_sample contient 1050 lignes et 35 colonnes.
In [4]:
data.head()
Out[4]:
Unnamed: 0 uniq_id crawl_timestamp product_url product_name pid retail_price discounted_price image is_FK_Advantage_product description product_rating brand product_specifications product_category_1 product_category_2 product_category_3 tokenized stopwords_removed porter_stemmed lemmatize cluster cvec_lem_pca_tsne cluster TFIDF_lem_PCA cluster CVec_lem_LDA cluster TFIDF_lem_LDA cluster CountVec_stem_PCA cluster TFIDF_stem_PCA cluster countVec_stem_LDA cluster TFIDF_stem_LDA cluster word2vec_lem cluster word2vec_stemmed cluster BERT_lem cluster BERT_stem cluster USE_lem cluster USE_stem
0 0 55b85ea15a1536d46b7190ad6fff8ce7 2016-04-30 03:22:56 +0000 http://www.flipkart.com/elegance-polyester-mul... Elegance Polyester Multicolor Abstract Eyelet ... CRNEG7BKMFFYHQ8Z 1899.0 899.0 55b85ea15a1536d46b7190ad6fff8ce7.jpg False key features of elegance polyester multicolor ... No rating available Elegance {"Brand":"Elegance", "Designed For":"Door", "T... Home Furnishing Curtains & Accessories Curtains ['key', 'features', 'of', 'elegance', 'polyest... ['key', 'features', 'elegance', 'polyester', '... key featur eleg polyest multicolor abstract ey... key feature elegance polyester multicolor abst... 4 4 1 3 2 4 4 3 1 5 6 4 2 2
1 1 7b72c92c2f6c40268628ec5f14c6d590 2016-04-30 03:22:56 +0000 http://www.flipkart.com/sathiyas-cotton-bath-t... Sathiyas Cotton Bath Towel BTWEGFZHGBXPHZUH 600.0 449.0 7b72c92c2f6c40268628ec5f14c6d590.jpg False specifications of sathiyas cotton bath towel 3... No rating available Sathiyas {"Machine Washable":"Yes", "Material":"Cotton"... Baby Care Baby Bath & Skin Baby Bath Towels ['specifications', 'of', 'sathiyas', 'cotton',... ['specifications', 'sathiyas', 'cotton', 'bath... specif sathiya cotton bath towel bath towel re... specification sathiyas cotton bath towel bath ... 6 4 1 1 5 4 0 1 1 1 6 0 2 2
2 2 64d5d4a258243731dc7bbb1eef49ad74 2016-04-30 03:22:56 +0000 http://www.flipkart.com/eurospa-cotton-terry-f... Eurospa Cotton Terry Face Towel Set BTWEG6SHXTDB2A2Y NaN NaN 64d5d4a258243731dc7bbb1eef49ad74.jpg False key features of eurospa cotton terry face towe... No rating available Eurospa {"Material":"Cotton Terry", "Design":"SHUVAM",... Baby Care Baby Bath & Skin Baby Bath Towels ['key', 'features', 'of', 'eurospa', 'cotton',... ['key', 'features', 'eurospa', 'cotton', 'terr... key featur eurospa cotton terri face towel set... key feature eurospa cotton terry face towel se... 6 4 1 3 5 4 0 3 1 1 6 0 2 2
3 3 d4684dcdc759dd9cdf41504698d737d8 2016-06-20 08:49:52 +0000 http://www.flipkart.com/santosh-royal-fashion-... SANTOSH ROYAL FASHION Cotton Printed King size... BDSEJT9UQWHDUBH4 2699.0 1299.0 d4684dcdc759dd9cdf41504698d737d8.jpg False key features of santosh royal fashion cotton p... No rating available SANTOSH ROYAL FASHION {"Brand":"SANTOSH ROYAL FASHION", "Machine Was... Home Furnishing Bed Linen Bedsheets ['key', 'features', 'of', 'santosh', 'royal', ... ['key', 'features', 'santosh', 'royal', 'fashi... key featur santosh royal fashion cotton print ... key feature santosh royal fashion cotton print... 1 1 1 3 2 6 4 3 1 1 6 0 2 6
4 4 6325b6870c54cd47be6ebfbffa620ec7 2016-06-20 08:49:52 +0000 http://www.flipkart.com/jaipur-print-cotton-fl... Jaipur Print Cotton Floral King sized Double B... BDSEJTHNGWVGWWQU 2599.0 698.0 6325b6870c54cd47be6ebfbffa620ec7.jpg False key features of jaipur print cotton floral kin... No rating available Jaipur Print {"Machine Washable":"Yes", "Brand":"Jaipur Pri... Home Furnishing Bed Linen Bedsheets ['key', 'features', 'of', 'jaipur', 'print', '... ['key', 'features', 'jaipur', 'print', 'cotton... key featur jaipur print cotton floral king siz... key feature jaipur print cotton floral king si... 1 1 1 3 2 6 4 3 1 1 6 0 2 6

Traitement des données images

In [5]:
image_path = [data['image'][i] for i in range(data.shape[0])]
print(len(image_path))
1050
In [6]:
dir_name = 'Flipkart/Images/'
In [7]:
# Exemple sur une image 
image_exemple = data['image'].loc[5]
file_path = os.path.join(dir_name, image_exemple)
# Afficher l'image
img_exemple = cv2.imread(file_path)
plt.imshow(img_exemple)
Out[7]:
<matplotlib.image.AxesImage at 0x7fe8e72d3790>
In [8]:
# Exemple sur une image 
image_exemple = data['image'].loc[80]
file_path = os.path.join(dir_name, image_exemple)
# Afficher l'image
img_exemple = cv2.imread(file_path)
plt.imshow(img_exemple)
Out[8]:
<matplotlib.image.AxesImage at 0x7fe8e74371c0>
In [9]:
for name in data['product_category_1'].unique():
    plt.figure(figsize=(10,10))

    print(name)
    for i in range(5):
        plt.subplot(4, 5, i+1)
        filename = dir_name + list(data[data["product_category_1"] == name]['image'])[i+31]
        image = imread(filename)
        plt.imshow(image)
        plt.axis('off')
    plt.show()
Home Furnishing 
Baby Care 
Watches 
Home Decor & Festive Needs 
Kitchen & Dining 
Beauty and Personal Care 
Computers 

SIFT

L'algorithme SIFT permet d'extraire des features (ou points d'intérêt) de l'image et de calculer leurs descripteurs. Il permets de détecter et identifier les éléments similaires entre différentes images.

Keypoint Descriptor

In [10]:
image_exemple = data['image'].loc[8]
file_path = os.path.join(dir_name, image_exemple)
sift = cv2.SIFT_create(500)

image = cv2.imread(file_path)
print('Image originale')
plt.imshow(image)
plt.axis("off")
plt.show()

image = cv2.imread(file_path, 0) # convert in gray
print('Image gris')
plt.imshow(image, cmap='gray')
plt.axis("off")
plt.show()

image = cv2.GaussianBlur(image, (5,5), cv2.BORDER_DEFAULT)
print('Image flou')

plt.imshow(image, cmap='gray')
plt.axis("off")
plt.show()

image = cv2.resize(image, (255, 255), interpolation=cv2.INTER_AREA)
print('Image redimensionné')

plt.imshow(image, cmap='gray')
plt.axis("off")
plt.show()

image = cv2.equalizeHist(image)   # equalize image histogram
print('Image avec contrast')

plt.imshow(image, cmap='gray')
plt.axis("off")
plt.show()

clahe = cv2.createCLAHE()          # Equalization by CLAHE
image = clahe.apply(image)
print('Contraste adapté et egalisation des histogrammes')
plt.imshow(image, cmap='gray')
plt.axis("off")
plt.show()

kp, des = sift.detectAndCompute(image, None)
img=cv2.drawKeypoints(image,kp,image)
print('Image prétraitée et descripteurs')
plt.imshow(img)
plt.axis("off")
plt.show()


print("Descripteurs : ", des.shape)
print()
print(des)
Image originale
Image gris
Image flou
Image redimensionné
Image avec contrast
Contraste adapté et egalisation des histogrammes
Image prétraitée et descripteurs
Descripteurs :  (409, 128)

[[ 18.   6.   0. ...   0.   0.   5.]
 [ 23.   2.   0. ...   0.   0.  21.]
 [  0.   0.   0. ...  71.   7.   3.]
 ...
 [137.  71.   0. ...   0.   0.   1.]
 [ 23. 131.  50. ...   0.   0.   1.]
 [  0.  11.  69. ...   0.   0.   0.]]

Feature Matching

In [11]:
image_exemple1 = data['image'].loc[7]
image_exemple2 = data['image'].loc[15]

file_path1 = os.path.join(dir_name, image_exemple1)
file_path2 = os.path.join(dir_name, image_exemple2)

# read images
img1 = cv2.imread(file_path1)  
img2 = cv2.imread(file_path2) 

img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

figure, ax = plt.subplots(1, 2, figsize=(16, 8))

ax[0].imshow(img1, cmap='gray')
ax[1].imshow(img2, cmap='gray')
Out[11]:
<matplotlib.image.AxesImage at 0x7fe8ef4bbf40>
In [12]:
#sift
sift = cv2.SIFT_create()

keypoints_1, descriptors_1 = sift.detectAndCompute(img1,None)
keypoints_2, descriptors_2 = sift.detectAndCompute(img2,None)

#feature matching
bf = cv2.BFMatcher(cv2.NORM_L1, crossCheck=True)

matches = bf.match(descriptors_1,descriptors_2)
matches = sorted(matches, key = lambda x:x.distance)

img3 = cv2.drawMatches(img1, keypoints_1, img2, keypoints_2, matches[:50], img2, flags=2)
plt.imshow(img3),plt.show()
Out[12]:
(<matplotlib.image.AxesImage at 0x7fe8d83453a0>, None)
In [13]:
print("Nombre de features communs aux deux images: {} features".format(len(matches)))
Nombre de features communs aux deux images: 242 features
In [14]:
# identification of key points and associated descriptors
sift_keypoints = []
sift = cv2.SIFT_create()
clahe = cv2.createCLAHE()

for image_num in range(len(image_path)) :
    if image_num%100 == 0 : print(image_num)
    image = cv2.imread(dir_name+image_path[image_num],0) # convert in gray
    image = cv2.GaussianBlur(image, (5,5), cv2.BORDER_DEFAULT)
    image = cv2.resize(image, (255, 255), interpolation=cv2.INTER_AREA)
    image = cv2.equalizeHist(image)   # equalize image histogram
    res = clahe.apply(image)
    kp, des = sift.detectAndCompute(res, None)
    sift_keypoints.append(des)
    
sift_keypoints_by_img = np.asarray(sift_keypoints, dtype='object')
sift_keypoints_all    = np.concatenate(sift_keypoints_by_img, axis=0)

print("Descripteurs : ", sift_keypoints_all.shape)
0
100
200
300
400
500
600
700
800
900
1000
Descripteurs :  (692877, 128)

Un descripteur est un vecteur qui décrit le voisinage de la feature à laquelle il est associé. Il est utilisé pour repérer les paires de features qui se ressemblent le plus dans deux images. Pour faciliter cette étape de matching, le descripteur doit présenter de nombreuses propriétés d'invariance (rotation, échelle, illumination).

Clustering

In [15]:
list_ari = []
def plot_kmeans_tsne(reduction, title, filename, colname):

    kmeans_tsne = KMeans(n_clusters=7, n_init=50, max_iter=200,init='k-means++', random_state=42).fit(reduction)
    labels_tsne = kmeans_tsne.labels_
    cl_tsne = pd.concat([reduction,pd.DataFrame({'tsne_clusters':labels_tsne})],axis=1)
    
    data[f'cluster {colname}'] = labels_tsne
    categories_predict = data[f'cluster {colname}']
    categories_true = data['product_category_1']
    adjusted_rand = metrics.adjusted_rand_score(categories_true, categories_predict)
    list_ari.append(adjusted_rand)
    print("\033[1mAdjusted Rand Index: %0.3f\033[0m" % adjusted_rand)
    
    fig = px.scatter(data, x=cl_tsne.iloc[:,0], y = cl_tsne.iloc[:,1], color=categories_true, title=f"Représentation selon les vraies classes {title}")
    
    fig1 = px.scatter(data, x = cl_tsne.iloc[:,0],y = cl_tsne.iloc[:,1], color=categories_predict, title = f"Représentation selon les clusters {title}")
    
    plotly.offline.plot(fig, filename=f'plots/{filename}.html')
    plotly.offline.plot(fig1, filename=f'plots/{filename}_cluster.html')

    return fig.show(), fig1.show()
In [16]:
from yellowbrick.cluster import KElbowVisualizer
# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(
    model, k=(2,10), timings=True
)

visualizer.fit(sift_keypoints_all)  # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure
Out[16]:
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
In [17]:
from sklearn import cluster, metrics

k = int(round(np.sqrt(len(sift_keypoints_all)),0))
print("Nombre de clusters estimés : ", k)
print("Création de",k, "clusters de descripteurs ...")

# Clustering
kmeans = cluster.MiniBatchKMeans(n_clusters=k, init_size=3*k,
                                 batch_size=2000, random_state=0)
kmeans.fit(sift_keypoints_all)
Nombre de clusters estimés :  832
Création de 832 clusters de descripteurs ...
Out[17]:
MiniBatchKMeans(batch_size=2000, init_size=2496, n_clusters=832, random_state=0)
In [18]:
def build_histogram(kmeans, des, image_num):
    res = kmeans.predict(des)
    hist = np.zeros(len(kmeans.cluster_centers_))
    nb_des=len(des)
    if nb_des==0 : print("problème histogramme image  : ", image_num)
    for i in res:
        hist[i] += 1.0/nb_des
    return hist
In [19]:
# Creation of a matrix of histograms
hist_vectors=[]

for i, image_desc in enumerate(sift_keypoints_by_img) :
    hist = build_histogram(kmeans, image_desc, i) #calculates the histogram
    hist_vectors.append(hist) #histogram is the feature vector

im_features_sift = np.asarray(hist_vectors)

PCA

In [20]:
def pca(vector):
    pca = PCA(n_components=0.95)
    ft_pca = pca.fit_transform(vector)
    
    return ft_pca
In [21]:
pca_sift = pca(im_features_sift)
In [22]:
tsne = TSNE(n_components=2, verbose=1, perplexity=80,n_iter=5000, learning_rate=200, random_state=42)
In [23]:
X_tsne_sift = tsne.fit_transform(pca_sift)

df_sift = pd.DataFrame(X_tsne_sift[:,0:2], columns=['tsne1', 'tsne2'])
print(df_sift.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.013s...
[t-SNE] Computed neighbors for 1050 samples in 0.945s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.015132
[t-SNE] KL divergence after 250 iterations with early exaggeration: 94.992477
[t-SNE] KL divergence after 1350 iterations: 1.481968
(1050, 2)
In [24]:
plot_kmeans_tsne(df_sift, "Clusters des descripteurs", "kmeans_sift", "sift")
Adjusted Rand Index: 0.046
Out[24]:
(None, None)
Le graphique montre que avec sift seulement, les categories sont très mal classifié
In [25]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster sift'] == x].index
             for x in data['cluster sift'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)

CNN

In [26]:
pathTrain = r'Flipkart/Images/Train/' 
pathTest = r'Flipkart/Images/Test/' 
pathValidation = r'Flipkart/Images/Validation/' 
In [27]:
if os.path.exists(pathTrain) == False:
    os.mkdir(pathTrain)

if os.path.exists(pathTest) == False:
    os.mkdir(pathTest)
    
if os.path.exists(pathValidation) == False:
    os.mkdir(pathValidation)
In [28]:
data['IDcatégorie'] = data['product_category_1'].copy()
map_categorie = {'Baby Care ': 0,
                 'Beauty and Personal Care ': 1,
                 'Computers ': 2,
                 'Home Decor & Festive Needs ': 3,
                 'Home Furnishing ': 4,
                 'Kitchen & Dining ': 5,
                 'Watches ': 6
                }

data['IDcatégorie'] = data['IDcatégorie'].map(map_categorie)
In [29]:
data.product_category_1.unique().tolist()
Out[29]:
['Home Furnishing ',
 'Baby Care ',
 'Watches ',
 'Home Decor & Festive Needs ',
 'Kitchen & Dining ',
 'Beauty and Personal Care ',
 'Computers ']
In [30]:
data.IDcatégorie.unique().tolist()
Out[30]:
[4, 0, 6, 3, 5, 1, 2]
In [31]:
X = data[['uniq_id', 'image', 'product_category_1', 'IDcatégorie']].copy()
Y = data[['uniq_id', 'product_category_1', 'IDcatégorie']]
In [32]:
listeCatégorie = ['Home Furnishing ', 'Baby Care ', 'Watches ',
                  'Home Decor & Festive Needs ', 'Kitchen & Dining ',
                  'Beauty and Personal Care ', 'Computers ']

Nous allons prendre 70% des images pour l'entraînement et 30% pour le test, soit 105 images pour l'entraînement et 45 pour le test et la validation.

In [33]:
for catégorie in listeCatégorie:
    path = pathTrain + catégorie +'/'
    if os.path.exists(path) == False:
        os.mkdir(pathTrain + catégorie +'/')
        listeImage = X[X['product_category_1'] == catégorie]['image'].values.tolist()[0:105]
    else:
        listeImage = X[X['product_category_1'] == catégorie]['image'].values.tolist()[0:105]
    
    for image in listeImage:
        shutil.copy(r'Flipkart/Images/' + image, pathTrain+catégorie+'/'+image)
In [34]:
for catégorie in listeCatégorie:
    path = pathTest+ catégorie +'/'
    if os.path.exists(path) == False:
        os.mkdir(pathTest + catégorie +'/')
        listeImage = X[X['product_category_1'] == catégorie]['image'].values.tolist()[105:150]
    else:
        listeImage = X[X['product_category_1'] == catégorie]['image'].values.tolist()[105:150]
    
    for image in listeImage:
        shutil.copy(r'Flipkart/Images/' + image, pathTest +catégorie+'/'+image)
In [35]:
for catégorie in listeCatégorie:
    path = pathValidation + catégorie +'/'
    if os.path.exists(path) == False:
        os.mkdir(pathValidation + catégorie +'/')
        listeImage = X[X['product_category_1'] == catégorie]['image'].values.tolist()[60:105]
    else:
        listeImage = X[X['product_category_1'] == catégorie]['image'].values.tolist()[60:105]
    
    for image in listeImage:
        shutil.copy(r'Flipkart/Images/' + image, pathValidation +catégorie+'/'+image)
In [36]:
# Load the normalized images
train_datagen = ImageDataGenerator(rescale = 1./255)
validation_datagen = ImageDataGenerator(rescale = 1./255)
Test_datagen = ImageDataGenerator(rescale = 1./255)


# Change the batchsize
train_batchsize = 100
val_batchsize = 10 
target_size=(224, 224)

# Data generator for training data
train_generator = train_datagen.flow_from_directory(
        pathTrain,
        target_size=target_size,
        batch_size=train_batchsize,
        class_mode='categorical') 

# Data generator for test data
validation_generator = validation_datagen.flow_from_directory(
        pathValidation,
        target_size=(224, 224),
        batch_size=val_batchsize,
        class_mode='categorical',
        shuffle=True)


# Data generator for validation data
test_generator = Test_datagen.flow_from_directory(
        pathTest,
        target_size = (224, 224),
        batch_size = 1,
        class_mode = None,
        shuffle = False)
Found 735 images belonging to 7 classes.
Found 315 images belonging to 7 classes.
Found 315 images belonging to 7 classes.

VGG-16

Version du réseau de neurones convolutif très connu appelé VGG-Net.

In [37]:
IMGSIZE       = 224    # Taille de l'image en input
EPOCH         = 22     # nombre d'epoch 
BATCH_SIZE    = 16     # traitement par batch d'images avant la descente de gradient
FREEZE_LAYERS = 15     # pour un VGG16 freeze de réapprentissage de certaines couches
TRAIN         = True   # Entrainement ou utilisation d'un réseau déjà entrainé
In [38]:
img = load_img(r'Flipkart/Images/3e2b2a04696f7d83a7835e9894d79df7.jpg')  # Charger l'image
plt.imshow(img)
Out[38]:
<matplotlib.image.AxesImage at 0x7fe8459abf70>
In [39]:
data.loc[data['image'] == '3e2b2a04696f7d83a7835e9894d79df7.jpg']
Out[39]:
Unnamed: 0 uniq_id crawl_timestamp product_url product_name pid retail_price discounted_price image is_FK_Advantage_product description product_rating brand product_specifications product_category_1 product_category_2 product_category_3 tokenized stopwords_removed porter_stemmed lemmatize cluster cvec_lem_pca_tsne cluster TFIDF_lem_PCA cluster CVec_lem_LDA cluster TFIDF_lem_LDA cluster CountVec_stem_PCA cluster TFIDF_stem_PCA cluster countVec_stem_LDA cluster TFIDF_stem_LDA cluster word2vec_lem cluster word2vec_stemmed cluster BERT_lem cluster BERT_stem cluster USE_lem cluster USE_stem cluster sift IDcatégorie
970 970 3e2b2a04696f7d83a7835e9894d79df7 2016-05-26 12:20:35 +0000 http://www.flipkart.com/bela-home-cotton-carto... Bela Home Cotton Cartoon Single Bedsheet BDSEGZCDGH2PU6YZ 599.0 319.0 3e2b2a04696f7d83a7835e9894d79df7.jpg False key features of bela home cotton cartoon singl... No rating available Bela Home {"Brand":"Bela Home", "Machine Washable":"Yes"... Baby Care Baby Bedding Baby Bedsheets ['key', 'features', 'of', 'bela', 'home', 'cot... ['key', 'features', 'bela', 'home', 'cotton', ... key featur bela home cotton cartoon singl beds... key feature bela home cotton cartoon single be... 1 1 1 3 2 6 4 3 1 1 6 0 2 6 1 0

Ce lit appartient à la catégorie Baby Care

In [40]:
img = img.resize((224, 224))
img = img_to_array(img)  # Convertir en tableau numpy
img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))  # Créer la collection d'images (un seul échantillon)

img = preprocess_input(img)  # Prétraiter l'image comme le veut VGG-16
In [41]:
model = VGG16()
In [42]:
y = model.predict(img)
In [43]:
print('Top 5 :', decode_predictions(y, top=5)[0])
Top 5 : [('n03131574', 'crib', 0.641943), ('n04033995', 'quilt', 0.16941057), ('n03938244', 'pillow', 0.07729893), ('n04344873', 'studio_couch', 0.055204608), ('n04550184', 'wardrobe', 0.026519408)]

VGG16 a bien reconnu que cette image contenait un oreiller un drap, une commode qui est juste. Mais vu que le jeu de données contient 7 classes spécifiques on va donc utiliser le transfer learning.

In [44]:
train_image_files = pathTrain
test_image_files = pathTest
IMSIZE = 224

def create_model():
    vgg = VGG16(input_shape=(IMSIZE, IMSIZE, 3), weights='imagenet', include_top=False)
 
    # Freeze existing VGG already trained weights
    for layer in vgg.layers:
        layer.trainable = False
     
    # Charger VGG-16 pré-entraîné sur ImageNet et sans les couches fully-connected
    model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

    # Récupérer la sortie de ce réseau
    x = model.output
    x = GlobalMaxPooling2D()(x)
    x = Dense(4096,activation='relu')(x)
    x = Dense(4096,activation='relu')(x)

    # Ajouter la nouvelle couche fully-connected pour la classification de nos 7 classes
    predictions = Dense(7, activation='softmax')(x)

    # Définir le nouveau modèle
    model = Model(inputs=model.input, outputs=predictions)
     
    
    # Compiler le modèle  
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizers.SGD(learning_rate=0.0001, momentum=0.9),
                  metrics=['accuracy'])
     
    model.summary()
     
    return model
 
mymodel = create_model()
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_3 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 56, 56, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 28, 28, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 28, 28, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 28, 28, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 28, 28, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 14, 14, 512)       0         
                                                                 
 block5_conv1 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 7, 7, 512)         0         
                                                                 
 global_max_pooling2d (Globa  (None, 512)              0         
 lMaxPooling2D)                                                  
                                                                 
 dense (Dense)               (None, 4096)              2101248   
                                                                 
 dense_1 (Dense)             (None, 4096)              16781312  
                                                                 
 dense_2 (Dense)             (None, 7)                 28679     
                                                                 
=================================================================
Total params: 33,625,927
Trainable params: 33,625,927
Non-trainable params: 0
_________________________________________________________________
In [45]:
history = mymodel.fit(
      train_generator,
      steps_per_epoch = train_generator.samples/train_generator.batch_size,
      epochs = 20,
      validation_data = validation_generator,
      validation_steps = validation_generator.samples/validation_generator.batch_size, verbose=1
    )
Epoch 1/20
7/7 [==============================] - 393s 53s/step - loss: 1.9734 - accuracy: 0.1374 - val_loss: 1.9504 - val_accuracy: 0.1714
Epoch 2/20
7/7 [==============================] - 392s 54s/step - loss: 1.9140 - accuracy: 0.2027 - val_loss: 1.8838 - val_accuracy: 0.2794
Epoch 3/20
7/7 [==============================] - 401s 55s/step - loss: 1.8434 - accuracy: 0.3320 - val_loss: 1.8169 - val_accuracy: 0.3619
Epoch 4/20
7/7 [==============================] - 402s 55s/step - loss: 1.7707 - accuracy: 0.4095 - val_loss: 1.7417 - val_accuracy: 0.4381
Epoch 5/20
7/7 [==============================] - 402s 55s/step - loss: 1.6852 - accuracy: 0.5061 - val_loss: 1.6551 - val_accuracy: 0.5302
Epoch 6/20
7/7 [==============================] - 403s 56s/step - loss: 1.5861 - accuracy: 0.6068 - val_loss: 1.5544 - val_accuracy: 0.6254
Epoch 7/20
7/7 [==============================] - 423s 59s/step - loss: 1.4770 - accuracy: 0.6599 - val_loss: 1.4357 - val_accuracy: 0.6825
Epoch 8/20
7/7 [==============================] - 417s 57s/step - loss: 1.3415 - accuracy: 0.7061 - val_loss: 1.2989 - val_accuracy: 0.7270
Epoch 9/20
7/7 [==============================] - 425s 64s/step - loss: 1.1927 - accuracy: 0.7306 - val_loss: 1.1486 - val_accuracy: 0.7302
Epoch 10/20
7/7 [==============================] - 396s 54s/step - loss: 1.0416 - accuracy: 0.7510 - val_loss: 0.9929 - val_accuracy: 0.7460
Epoch 11/20
7/7 [==============================] - 402s 55s/step - loss: 0.8963 - accuracy: 0.7714 - val_loss: 0.8626 - val_accuracy: 0.7556
Epoch 12/20
7/7 [==============================] - 426s 59s/step - loss: 0.7753 - accuracy: 0.7864 - val_loss: 0.7405 - val_accuracy: 0.7873
Epoch 13/20
7/7 [==============================] - 427s 59s/step - loss: 0.6724 - accuracy: 0.8014 - val_loss: 0.6552 - val_accuracy: 0.8032
Epoch 14/20
7/7 [==============================] - 430s 59s/step - loss: 0.5922 - accuracy: 0.8327 - val_loss: 0.5750 - val_accuracy: 0.8349
Epoch 15/20
7/7 [==============================] - 431s 59s/step - loss: 0.5217 - accuracy: 0.8544 - val_loss: 0.5218 - val_accuracy: 0.8508
Epoch 16/20
7/7 [==============================] - 428s 59s/step - loss: 0.4598 - accuracy: 0.8707 - val_loss: 0.4527 - val_accuracy: 0.8730
Epoch 17/20
7/7 [==============================] - 428s 64s/step - loss: 0.4049 - accuracy: 0.8884 - val_loss: 0.3951 - val_accuracy: 0.8984
Epoch 18/20
7/7 [==============================] - 429s 59s/step - loss: 0.3572 - accuracy: 0.9034 - val_loss: 0.3519 - val_accuracy: 0.9079
Epoch 19/20
7/7 [==============================] - 427s 59s/step - loss: 0.3171 - accuracy: 0.9129 - val_loss: 0.3105 - val_accuracy: 0.9302
Epoch 20/20
7/7 [==============================] - 428s 64s/step - loss: 0.2745 - accuracy: 0.9293 - val_loss: 0.2643 - val_accuracy: 0.9492
In [46]:
# Utility function for plotting of the model results

def visualize_results(history):
    # Plot the accuracy and loss curves
    acc = history.history["accuracy"]
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(len(acc))
    plt.plot(epochs, acc, 'b', label='Training acc')
    plt.plot(epochs, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.figure()
    plt.plot(epochs, loss, 'b', label='Training loss')
    plt.plot(epochs, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()

# Run the function to illustrate accuracy and loss
visualize_results(history)

Génération des prédictions

Maintenant que notre modèle est entrainé nous allons pouvoir l'évaluer et lui demander de faire les prédictions sur nos images de tests

In [73]:
mymodel.evaluate_generator(generator = validation_generator)
Out[73]:
[0.26426059007644653, 0.9492063522338867]
In [48]:
mymodel.evaluate_generator(generator = train_generator)
Out[48]:
[0.2464742213487625, 0.9414966106414795]

Nous pouvons voir un taux de bonne classification supérieur à 94%

In [49]:
# Affichage des classes du jeu d'entrainement
train_generator.class_indices
Out[49]:
{'Baby Care ': 0,
 'Beauty and Personal Care ': 1,
 'Computers ': 2,
 'Home Decor & Festive Needs ': 3,
 'Home Furnishing ': 4,
 'Kitchen & Dining ': 5,
 'Watches ': 6}
In [50]:
# Génération des prédictions
test_generator.reset()
pred = mymodel.predict_generator(test_generator, steps = test_generator.samples/test_generator.batch_size, verbose = 1)
315/315 [==============================] - 52s 165ms/step
In [68]:
predicted_classes = np.argmax(pred, axis=1)
labels = train_generator.class_indices
labels = dict((v,k) for k,v in labels.items())
predictions = [labels[k] for k in predicted_classes]
In [86]:
# Retrieve a batch of images from the test set
filenamesTest = test_generator.filepaths


for i in range(10):
    ax = plt.subplot(2, 5, i+1)
    print(filenamesTest[i+30])
    img = load_img(filenamesTest[i+30])  # Charger l'image
    plt.imshow(img)
    plt.title([predictions[i]])
    plt.axis("off")
Flipkart/Images/Test/Baby Care /a541b3aba326d7749b4c086c3cea9273.jpg
Flipkart/Images/Test/Baby Care /b1644f47c7dfa58f8c06677f2a27ee09.jpg
Flipkart/Images/Test/Baby Care /b454f9f449c9dff58b90113ba984ea98.jpg
Flipkart/Images/Test/Baby Care /b58309ed929c2bf7d6096c03667b654f.jpg
Flipkart/Images/Test/Baby Care /bdcc7e18ffed5390cd3107030992ffa6.jpg
Flipkart/Images/Test/Baby Care /be0f39341d771aac57084970f1ed6425.jpg
Flipkart/Images/Test/Baby Care /c3edc504d1b4f0ba6224fa53a43a7ad6.jpg
Flipkart/Images/Test/Baby Care /c44a5dc5b5ebe5b3e0535b7c2b7921e4.jpg
Flipkart/Images/Test/Baby Care /d627a270302d23eef773d05ad01fd03c.jpg
Flipkart/Images/Test/Baby Care /e347e7eca70e01badc3bdf6c154ac7c4.jpg
On peut voir qu'il y en a des produits qui ont été mal classifié mais c'est peut être dû à la similarité entre les produits des divers catégories.
In [87]:
# identification of key points and associated descriptors
import time, cv2

path = "Flipkart/Images/"
list_photos =  data['image'].values.tolist()

vgg_keypoints = []

temps1=time.time()

for image_num in range(len(list_photos)) :
    if image_num%50 == 0 : print(image_num)
    # load an image from file
    image = load_img(path+list_photos[image_num], target_size=(224, 224))
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    #reduction bruit
    image = cv2.GaussianBlur(image, (5, 5), 0)
    # reshape data for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # prepare the image for the VGG model
    image = preprocess_input(image)
    features = mymodel.predict(image)
    vgg_keypoints.append(features)

vgg_keypoints_by_img = np.asarray(vgg_keypoints)
vgg_keypoints_all    = np.concatenate(vgg_keypoints_by_img, axis=0)

print()
print("Nombre de descripteurs VGG : ", vgg_keypoints_all.shape)

duration1=time.time()-temps1
print("temps de traitement VGG descriptor : ", "%15.2f" % duration1, "secondes")
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000

Nombre de descripteurs VGG :  (1050, 7)
temps de traitement VGG descriptor :           214.18 secondes
In [88]:
pca_vgg_lem = pca(vgg_keypoints_all)
In [89]:
X_tsne_vgg = tsne.fit_transform(pca_vgg_lem)

df_tsne_vgg = pd.DataFrame(X_tsne_vgg, columns=['tsne1', 'tsne2'])
print(df_tsne_vgg.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.003s...
[t-SNE] Computed neighbors for 1050 samples in 0.067s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 35.118217
[t-SNE] KL divergence after 1500 iterations: 0.111920
(1050, 2)
In [90]:
plot_kmeans_tsne(df_tsne_vgg, "Clusters des descripteurs VGG", "kmeans_vgg", "vgg")
Adjusted Rand Index: 0.470
Out[90]:
(None, None)
In [91]:
# Analyse des différentes catégories dans les labels
index_tot = [data[data['cluster vgg'] == x].index
             for x in data['cluster vgg'].value_counts().index]

plt.figure(figsize=(20, 20))
for x in range(len(index_tot)):
    order = data.loc[index_tot[x], 'product_category_1'].value_counts()
    order_hue = order.index
    plt.subplot(4, len(index_tot)/3, x+1)
    sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
In [92]:
df_ari=pd.DataFrame([list_ari]
                    ,columns=['sift','vgg'],
                    index=['ARI_SCORE'])
In [93]:
df_ari.T.round(2).plot(kind="bar",figsize=(10,6))
plt.xlabel("Model")
plt.ylabel("ARI Score")
Out[93]:
Text(0, 0.5, 'ARI Score')
In [94]:
df1 = pd.read_csv("Flipkart/ari.csv", index_col=0)
In [95]:
df = df1.join(df_ari, how="inner")
In [96]:
df.T.round(2).plot(kind="bar",figsize=(10,6))
plt.xlabel("Model")
plt.ylabel("ARI Score")
Out[96]:
Text(0, 0.5, 'ARI Score')
In [97]:
df.to_csv("Flipkart/ari_im.csv")
In [98]:
im_features_df = pd.DataFrame(im_features_sift)
im_features_df.to_csv('Flipkart/im_features_sift.csv')
In [99]:
im_features_df.shape
Out[99]:
(1050, 832)
In [100]:
im_vgg_features_df = pd.DataFrame(vgg_keypoints_all)
im_vgg_features_df.to_csv('Flipkart/im_features_vgg.csv')
In [101]:
im_vgg_features_df.shape
Out[101]:
(1050, 7)
Le modèle Kmeans avec le CNN donne le meilleur résultat avec 0.470
In [ ]: